#include "pdsp_defs.h"

void
*pdgstrf_thread(void *arg)
{
/*
 * -- SuperLU MT routine (version 1.0) --
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
 * and Lawrence Berkeley National Lab.
 * August 15, 1997
 *
 * Purpose
 * =======
 *
 * This is the slave process, representing the main scheduling loop to
 * perform the factorization. Each process executes a copy of the
 * following code ... (SPMD paradigm)
 *
 * Working arrays local to each process
 * ======================================
 *   marker[0:3*m-1]: marker[i] == j means node i has been reached when 
 *                                 working on column j.
 *	Storage: relative to original row subscripts
 *
 *	THERE ARE 3 OF THEM:
 *          marker[0 : m-1]:   used by pdgstrf_factor_snode() and 
 *                                     pdgstrf_panel_dfs();
 *          marker[m : 2m-1]:  used by pdgstrf_panel_dfs() and 
 *                                     pxgstrf_super_bnd_dfs();
 *                values in [0 : n-1]  when used by pdgstrf_panel_dfs()
 *                values in [n : 2n-1] when used by pxgstrf_super_bnd_dfs()
 *	    marker[2m : 3m-1]: used by pdgstrf_column_dfs() in inner-factor 
 *
 *   parent[0:n-1]: parent vector used during dfs
 *      Storage: relative to new row subscripts
 *
 *   xplore[0:2m-1]: xplore[i] gives the location of the next (dfs) 
 *	unexplored neighbor of i in lsub[*]; xplore[n+i] gives the
 *      location of the last unexplored neighbor of i in lsub[*].
 *
 *   segrep[0:nseg-1]: contains the list of supernodal representatives
 *	in topological order of the dfs. A supernode representative is the 
 *	last column of a supernode.
 *
 *   repfnz[0:m-1]: for a nonzero segment U[*,j] that ends at a 
 *	supernodal representative r, repfnz[r] is the location of the first 
 *	nonzero in this segment.  It is also used during the dfs:
 *      repfnz[r]>0 indicates that supernode r has been explored.
 *	NOTE: There are w of them, each used for one column of a panel. 
 *
 *   panel_lsub[0:w*m-1]: temporary for the nonzero row indices below 
 *      the panel diagonal. These are filled in during pdgstrf_panel_dfs(), 
 *      and are used later in the inner LU factorization.
 *	panel_lsub[]/dense[] pair forms the SPA data structure.
 *	NOTE: There are w of them.
 *
 *   dense[0:w*m-1]: sparse accumulator (SPA) for intermediate values;
 *	NOTE: there are w of them.
 *
 *   tempv[0:m-1]: real temporary used for dense numeric kernels;
 *
 * 
 * Scheduling algorithm (For each process ...)
 * ====================
 *     Shared task Q <-- { relaxed s-nodes (CANGO) };
 *
 *     WHILE (not finished)
 *
 *         panel = Scheduler(Q); (see pxgstrf_scheduler.c for policy)
 *
 *         IF (panel == RELAXED_SNODE)
 *             factor_relax_snode(panel);
 *         ELSE
 *             * pdgstrf_panel_dfs()
 *                 - skip all BUSY s-nodes (or panels)
 *
 *             * dpanel_bmod()
 *                 - updates from DONE s-nodes
 *                 - wait for BUSY s-nodes to become DONE
 *
 *             * inner-factor()
 *                 - identical as it is in the sequential algorithm,
 *                   except that pruning() will interact with the
 *                   pdgstrf_panel_dfs() of other panels.
 *         ENDIF
 *
 *     END WHILE;
 *
 */

#if ( MACH==SGI || MACH==ORIGIN )
#if ( MACH==SGI )
    int         pnum = mpc_my_threadnum();
#else ( MACH==ORIGIN )
    int         pnum = mp_my_threadnum();
#endif
    pdgstrf_threadarg_t *thr_arg = &((pdgstrf_threadarg_t *)arg)[pnum];
#else
    pdgstrf_threadarg_t *thr_arg  = arg;
    int         pnum = thr_arg->pnum;
#endif

    /* Unpack the options argument */
    pdgstrf_options_t *pdgstrf_options= thr_arg->pdgstrf_options;
    pxgstrf_shared_t  *pxgstrf_shared= thr_arg->pxgstrf_shared;
    int         panel_size = pdgstrf_options->panel_size;
    double      diag_pivot_thresh = pdgstrf_options->diag_pivot_thresh;
    yes_no_t    *usepr     = &pdgstrf_options->usepr; /* may be modified */
    int         *etree     = pdgstrf_options->etree;
    int         *super_bnd = pdgstrf_options->part_super_h;
    int         *perm_r    = pdgstrf_options->perm_r;
    int         *inv_perm_c= pxgstrf_shared->inv_perm_c;
    int         *inv_perm_r= pxgstrf_shared->inv_perm_r;
    int	        *xprune    = pxgstrf_shared->xprune;
    int	        *ispruned  = pxgstrf_shared->ispruned;
    SuperMatrix *A         = pxgstrf_shared->A;
    GlobalLU_t  *Glu       = pxgstrf_shared->Glu;
    Gstat_t 	*Gstat     = pxgstrf_shared->Gstat;
    int         *info      = &thr_arg->info;

    /* Local working arrays */
    int       *iwork;
    double    *dwork;
    int	      *segrep, *repfnz, *parent, *xplore;
    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int	      *marker, *marker1, *marker2;
    int       *lbusy; /* "Local busy" array, indicates which descendants
			 were busy when this panel's computation began.
			 Those columns (s-nodes) are treated specially
			 during pdgstrf_panel_dfs() and dpanel_bmod(). */

    int       *spa_marker; /* size n-by-w */
    int       *w_lsub_end; /* record the end of each column in panel_lsub */
    double    *dense, *tempv;
    int       *lsub, *xlsub, *xlsub_end;

    /* Local scalars */
    register int m, n, k, jj, jcolm1, itemp, singular;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;	/* no of segments in U-column above panel row jcol */
    int       nseg;	/* no of segments in each U-column */
    int       w, bcol, jcol;

#ifdef PROFILE
    double *utime = Gstat->utime;
    double t1, t2, t, stime;
    register float flopcnt;
#endif

#ifdef PREDICT_OPT
    flops_t  *ops = Gstat->ops;
    register float pdiv;
#endif
    
#if ( DEBUGlevel>=1 )
    printf("(%d) thr_arg-> pnum %d, info %d\n", pnum, thr_arg->pnum, thr_arg->info);
#endif

    singular   = 0;
    m          = A->nrow;
    n          = A->ncol;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;

    /* Allocate and initialize the per-process working storage. */
    if ( (*info = pdgstrf_WorkInit(m, panel_size, &iwork, &dwork)) ) {
	*info += pdgstrf_memory_use(Glu->nzlmax, Glu->nzumax, Glu->nzlumax);
	return 0;
    }
    pxgstrf_SetIWork(m, panel_size, iwork, &segrep, &parent, &xplore,
	     &repfnz, &panel_lsub, &marker, &lbusy);
    pdgstrf_SetRWork(m, panel_size, dwork, &dense, &tempv);
    
    /* New data structures to facilitate parallel algorithm */
    spa_marker = intMalloc(m * panel_size);
    w_lsub_end = intMalloc(panel_size);
    ifill (spa_marker, m * panel_size, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    ifill (lbusy, m, EMPTY);
    jcol = EMPTY;
    marker1 = marker + m;
    marker2 = marker + 2*m;

#ifdef PROFILE    
    stime = SuperLU_timer_();
#endif

    /* -------------------------
       Main loop: repeatedly ...
       ------------------------- */
    while ( pxgstrf_shared->tasks_remain > 0 ) {
        
#ifdef PROFILE
	TIC(t);
#endif
	/* Get a panel from the scheduler. */
	pxgstrf_scheduler(pnum, n, etree, &jcol, &bcol, pxgstrf_shared);

#if ( DEBUGlevel>=1 )
    if ( jcol>=LOCOL && jcol<=HICOL ) {
	printf("(%d) Scheduler(): jcol %d, bcol %d, tasks_remain %d\n", 
	       pnum, jcol, bcol, pxgstrf_shared->tasks_remain);
	fflush(stdout);
    }
#endif

#ifdef PROFILE	    
	TOC(t2, t);
	Gstat->procstat[pnum].skedtime += t2;	    
#endif
	    
	if ( jcol != EMPTY ) {
	    w = pxgstrf_shared->pan_status[jcol].size;

#if ( DEBUGlevel>=3 )
	    printf("P%2d got panel %5d-%5d\ttime %.4f\tpanels_left %d\n",
		   pnum, jcol, jcol+w-1, SuperLU_timer_(), 
		   pxgstrf_shared->tasks_remain);
	    fflush(stdout); 
#endif
	    /* Nondomain panels */
#ifdef PROFILE
	    flopcnt = Gstat->procstat[pnum].fcops;
	    panstat[jcol].pnum = pnum;
	    TIC(t1);
	    panstat[jcol].starttime = t1;
#endif
	    if ( pxgstrf_shared->pan_status[jcol].type == RELAXED_SNODE ) {
		
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops;
#endif
		/* A relaxed supernode at the bottom of the etree */
		pdgstrf_factor_snode
		    (pnum, jcol, A, diag_pivot_thresh, usepr,
		     perm_r, inv_perm_r, inv_perm_c, xprune, marker,
		     panel_lsub, dense, tempv, pxgstrf_shared, info);
		if ( *info ) {
		    if ( *info > n ) return 0;
		    else if ( singular == 0 || *info < singular ) 
		        singular = *info;
#if ( DEBUGlevel>=1 )
    printf("(%d) After pdgstrf_factor_snode(): singular=%d\n", pnum, singular);
#endif
		}

		/* Release the whole relaxed supernode */
		for (jj = jcol; jj < jcol + w; ++jj) 
		    pxgstrf_shared->spin_locks[jj] = 0;
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops - pdiv;
		cp_panel[jcol].pdiv = pdiv;
#endif
	    } else { /* Regular panel */
#ifdef PROFILE
		TIC(t);
#endif
		pxgstrf_mark_busy_descends(pnum, jcol, etree, pxgstrf_shared, 
					   &bcol, lbusy);
		
		/* Symbolic factor on a panel of columns */
		pdgstrf_panel_dfs
		    (pnum, m, w, jcol, A, perm_r, xprune,ispruned,lbusy,
		     &nseg1, panel_lsub, w_lsub_end, segrep, repfnz,
		     marker, spa_marker, parent, xplore, dense, Glu);
#if ( DEBUGlevel>=2 )
  if ( jcol==BADPAN )
    printf("(%d) After pdgstrf_panel_dfs(): nseg1 %d, w_lsub_end %d\n",
	   pnum, nseg1, w_lsub_end[0]);
#endif
#ifdef PROFILE
		TOC(t2, t);
		utime[DFS] += t2;
#endif
		/* Numeric sup-panel updates in topological order.
		 * On return, the update values are temporarily stored in 
		 * the n-by-w SPA dense[m,w].
		 */
		pdgstrf_panel_bmod
		    (pnum, m, w, jcol, bcol, inv_perm_r, etree,
		     &nseg1, segrep, repfnz, panel_lsub, w_lsub_end,
		     spa_marker, dense, tempv, pxgstrf_shared);

		/*
		 * All "busy" descendants are "done" now --
		 * Find the set of row subscripts in the preceeding column
		 * "jcol-1" of the current panel. Column "jcol-1" is
		 * usually taken by a process other than myself.
		 * This row-subscripts information will be used by myself
		 * during column dfs to detect whether "jcol" belongs
		 * to the same supernode as "jcol-1".
		 * 
		 * ACCORDING TO PROFILE, THE AMOUNT OF TIME SPENT HERE 
		 * IS NEGLIGIBLE.
		 */
		jcolm1 = jcol - 1;
		itemp = xlsub_end[jcolm1];
		for (k = xlsub[jcolm1]; k < itemp; ++k)
		    marker2[lsub[k]] = jcolm1;
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops;
#endif
		/* Inner-factorization, using sup-col algorithm */
		for ( jj = jcol; jj < jcol + w; jj++) {
		    k = (jj - jcol) * m; /* index into w-wide arrays */
		    nseg = nseg1; /* begin after all the panel segments */
#ifdef PROFILE
		    TIC(t);
#endif
		    /* Allocate storage for the current H-supernode. */
		    if ( Glu->dynamic_snode_bound && super_bnd[jj] ) {
		        /* jj starts a supernode in H */
			pxgstrf_super_bnd_dfs
			    (pnum, m, n, jj, super_bnd[jj], A, perm_r, 
			     inv_perm_r, xprune, ispruned, marker1, parent, 
			     xplore, pxgstrf_shared);
		    }
		    
		    if ( (*info = pdgstrf_column_dfs
			            (pnum, m, jj, jcol, perm_r, ispruned,
				     &panel_lsub[k],w_lsub_end[jj-jcol],
				     super_bnd, &nseg, segrep,
				     &repfnz[k], xprune, marker2,
				     parent, xplore, pxgstrf_shared)) )
			return 0;
#ifdef PROFILE
		    TOC(t2, t);
		    utime[DFS] += t2;
#endif
		    /* On return, the L supernode is gathered into the
		       global storage. */
		    if ( (*info = pdgstrf_column_bmod
			          (pnum, jj, jcol, (nseg - nseg1),
				   &segrep[nseg1], &repfnz[k],
				   &dense[k], tempv, pxgstrf_shared, Gstat)) )
			return 0;
		
		    if ( (*info = pdgstrf_pivotL
			            (pnum, jj, diag_pivot_thresh, usepr,
				     perm_r, inv_perm_r, inv_perm_c,
				     &pivrow, Glu, Gstat)) )
			if ( singular == 0 || *info < singular ) {
			    singular = *info;
#if ( DEBUGlevel>=1 )
    printf("(%d) After pdgstrf_pivotL(): singular=%d\n", pnum, singular);
#endif
			}

                    /* release column "jj", so that the other processes
                       waiting for this column can proceed */
		    pxgstrf_shared->spin_locks[jj] = 0;
		    
		    /* copy the U-segments to ucol[*] */
		    if ( (*info = pdgstrf_copy_to_ucol
			            (pnum,jj,nseg,segrep,&repfnz[k],
				     perm_r, &dense[k], pxgstrf_shared)) )
		      return 0;

		    /* Prune columns [0:jj-1] using column jj */
		    pxgstrf_pruneL(jj, perm_r, pivrow, nseg, segrep,
				   &repfnz[k], xprune, ispruned, Glu);

		    /* Reset repfnz[] for this column */
		    pxgstrf_resetrep_col (nseg, segrep, &repfnz[k]);

#if ( DEBUGlevel>=2 )
/*  if (jj >= LOCOL && jj <= HICOL) {*/
  if ( jj==BADCOL ) {
    dprint_lu_col(pnum, "panel:", jcol, jj, w, pivrow, xprune, Glu);
    dcheck_zero_vec(pnum, "after pdgstrf_copy_to_ucol() dense_col[]", n, &dense[k]);
  }
#endif
		} /* for jj ... */
		
#ifdef PREDICT_OPT
		pdiv = Gstat->procstat[pnum].fcops - pdiv;
		cp_panel[jcol].pdiv = pdiv;
#endif
		
	    } /* else regular panel ... */
	    
	    STATE( jcol ) = DONE; /* Release panel jcol. */
	    
#ifdef PROFILE
	    TOC(panstat[jcol].fctime, t1);
	    panstat[jcol].flopcnt += Gstat->procstat[pnum].fcops - flopcnt;
	    /*if ( Glu->tasks_remain < P ) {
		flops_last_P_panels += panstat[jcol].flopcnt;
		printf("Panel %d, flops %e\n", jcol, panstat[jcol].flopcnt);
		fflush(stdout);
	    } */
#endif

	}
#ifdef PROFILE
	else { /* No panel from the task queue - wait and try again */
	    Gstat->procstat[pnum].skedwaits++;
	}
#endif
	
    } /* while there are more panels */

    *info = singular;

    /* Free work space and compress storage */
    pdgstrf_WorkFree(iwork, dwork, Glu);
    SUPERLU_FREE (spa_marker);
    SUPERLU_FREE (w_lsub_end);

#ifdef PROFILE
    Gstat->procstat[pnum].fctime = SuperLU_timer_() - stime;
#endif

    return 0;
}

